-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[VPlan] Emit VPVectorEndPointerRecipe for reverse interleave pointer adjustment #144864
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms Author: Mel Chen (Mel-Chen) ChangesThis patch introduces VPReverseInterleavePtrRecipe, a new recipe that adjusts the pointer of a reverse interleave group. It takes the pointer of member 0 and the VF as operands and computes the pointer to the last vector lane. The final goal is to support EVL tail folding for interleaved accesses. Given that VPInterleaveRecipe is large and tightly coupled — combining both load and store, and embedding operations like reverse pointer adjustion (GEP), widen load/store, deinterleave/interleave, and reversal — breaking it down into smaller, dedicated recipes may allow VPlanTransforms::tryAddExplicitVectorLength to lower them into EVL-aware form more effectively. One foreseeable challenge is that VPlanTransforms::convertToConcreteRecipes currently runs after tryAddExplicitVectorLength, so decomposing VPInterleaveRecipe will likely need to happen earlier in the pipeline to be effective. Full diff: https://github.com/llvm/llvm-project/pull/144864.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..ce40c6ccba92e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,6 +4256,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
+ case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8ec..daef26fe86d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+ Type *IndexedTy;
+ unsigned Factor;
+
+public:
+ VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+ unsigned Factor, GEPNoWrapFlags GEPFlags,
+ DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+ IndexedTy(IndexedTy), Factor(Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+ VPValue *getPtr() const { return getOperand(0); }
+
+ VPValue *getVFValue() const { return getOperand(1); }
+
+ void execute(VPTransformState &State) override;
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ // TODO: Compute accurate cost after retiring the legacy cost model.
+ return 0;
+ }
+
+ VPReverseInterleavePtrRecipe *clone() override {
+ return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+ Factor, getGEPNoWrapFlags(),
+ getDebugLoc());
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..98889cb5c520c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,9 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..40dde8cfaea73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
+ case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2262,6 +2263,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+ Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+ Type *IndexTy = Builder.getInt32Ty();
+ if (RuntimeVF->getType() != IndexTy)
+ RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+ Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+ Index = Builder.CreateNeg(Index);
+ Value *ReversePtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+ State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = reverse-interleave-ptr";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
@@ -3223,25 +3251,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform.
- if (Group->isReverse()) {
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- Value *Index =
- State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
- Index = State.Builder.CreateMul(Index,
- State.Builder.getInt32(Group->getFactor()));
- Index = State.Builder.CreateNeg(Index);
-
- bool InBounds = false;
- if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
- InBounds = Gep->isInBounds();
- ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
- }
-
State.setDebugLocFrom(getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a930329..6068b87663047 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2489,6 +2489,21 @@ void VPlanTransforms::createInterleaveGroups(
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
}
+ // If the group is reverse, adjust the index to refer to the last vector
+ // lane instead of the first. We adjust the index from the first vector
+ // lane, rather than directly getting the pointer for lane VF - 1, because
+ // the pointer operand of the interleaved access is supposed to be uniform.
+ if (IG->isReverse()) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+ auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ InsertPos->getDebugLoc());
+ ReversePtr->insertBefore(InsertPos);
+ Addr = ReversePtr;
+ }
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695a..0333035a4b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1579,8 +1579,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1599,8 +1599,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
|
|
As mentioned in this patch, supporting interleaved access with EVL tail folding could be complex. The another simpler approach might be to directly create a VPInterleaveEVLRecipe and extract as much shared code as possible into static functions. I'd like to open a discussion to clarify which direction would be preferable. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to somehow reuse VPVectorEndPointerRecipe and add a Factor field on it? The two recipes look quite similar
Mel-Chen@9e1df0f |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the other
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would be good to share the logic to determine inbounds from the GEP with similar code above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
3b1e18430853e30b8707734ad2422c4311081941
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
38e145c19ce288e932b9ad26b9fe4d26d4699877
b50ca85 to
3b1e184
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like previously the index types for interleave groups were i32 but now it's determined by datalayout because of the changes in getGEPIndexTy. I don't have much of an opinion on this, was there a reason why it was needed for this PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The original implementation uses i64 as the index type in the GEP.
I think one reason getGEPIndexTy can't simply choose i32 is due to scalable VF in this case;
another reason is that we compute NumElt = Stride * CurrentPart * RunTimeVF, and if the stride is not unit stride, then we have to use the type determined by datalayout.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stride is currently always negative, right? Can we assert?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, so far it must be negative.
ef6b46a55b9517f9de79b42424f6257cff83638f
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// The constant stride of the pointer computed by this recipe. | |
| /// The constant stride of the pointer computed by this recipe. The stride will be scaled by IndexedTy. |
Would be good to make clear that the stride is not scaled, but a strid ein terms of IndexedTy unless I am missing something. There might be a clearer way to express this than my suggestion above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
5a7c4bf6cf35123f17749974200a9662ee707561
3b1e184 to
5a7c4bf
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM,thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unrelated, but the test should probably not check the full debug output
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I often spend quite a bit of time manually updating this test case too.
But I'm not sure what this test was originally intended to verify — is it the generated IR, the shape of the VPlan, or something else that needs to be checked? cc. @LiqinWeng
5a7c4bf to
a3eb766
Compare
A reverse interleave access is essentially composed of multiple load/store operations with same negative stride, and their addresses are based on the last lane address of member 0 in the interleaved group.
Currently, we already have VPVectorEndPointerRecipe for computing the last lane address of consecutive reverse memory accesses. This patch extends VPVectorEndPointerRecipe to support constant stride and extracts the reverse interleave group address adjustment from VPInterleaveRecipe::execute, replacing it with a VPVectorEndPointerRecipe.
The final goal is to support interleaved accesses with EVL tail folding. Given that VPInterleaveRecipe is large and tightly coupled — combining both load and store, and embedding operations like reverse pointer adjustion (GEP), widen load/store, deinterleave/interleave, and reversal — breaking it down into smaller, dedicated recipes may allow VPlanTransforms::tryAddExplicitVectorLength to lower them into EVL-aware form more effectively.
One foreseeable challenge is that VPlanTransforms::convertToConcreteRecipes currently runs after tryAddExplicitVectorLength, so decomposing VPInterleaveRecipe will likely need to happen earlier in the pipeline to be effective.